Load libraries and set filepaths

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg38.knownGene))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(VariantAnnotation))
db_path <- "./input"
output_dir <- "./output"
# ftp_path <- "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"

clinvar_EGFR_path <-
  file.path(db_path, "variant_summary_GRCh38_EGFR.tsv")

cosmic_path <-
  file.path(db_path, "Cosmic_GenomeScreensMutant_v99_GRCh38_EGFR.tsv")

ABE8e_path <-
  file.path(db_path, "sgrna_designs_ABE8e.csv")
BE4max_path <-
  file.path(db_path, "sgrna_designs_BE4max.csv")
syn_codon_path <-
  file.path(db_path, "syn_codons.txt")

variants_table_path <-
  file.path(output_dir, "variants_table.tsv")

all_variants_paired_path <-  
  file.path(output_dir, "all_variants_paired.tsv")

Extract clinvar data and save to file for later processing

# # run once
# clinvar_all_path <-
#   file.path(db_path, "variant_summary.txt.gz")
#
#
# clinvar_all <- read_tsv(gzfile(clinvar_all_path))
#
# clinvar_EGFR <-
#   clinvar_all %>%
#   filter(GeneSymbol == "EGFR" &
#     Assembly == "GRCh38" &
#     !(Type %in% c("Microsatellite", "Inversion")) &
#     Start >= 55019017)
# write_tsv(clinvar_EGFR, clinvar_EGFR_path)

Create Gene model for EGFR-201 (ENST00000275493.7)

hg_genome <- BSgenome.Hsapiens.UCSC.hg38
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
txdb <- keepSeqlevels(txdb, "chr7")
tx_lengths <-
  transcriptLengths(txdb,
    with.utr5_len = TRUE,
    with.utr3_len = TRUE
  )

tx_lengths <-
  tx_lengths %>%
  dplyr::filter(gene_id == 1956) %>%
  arrange(-nexon)
tx_lengths
(tx_lengths$tx_len - (tx_lengths$utr5_len + tx_lengths$utr3_len)) / 3
 [1] 1211.0000 1158.0000 1092.0000  706.0000  629.0000  406.0000  721.6667  344.6667 1275.6667  567.0000  221.6667  802.3333  187.0000

Get the cds parts, the introns, the 3UTR, and the 5UTR and combine

cds_tx <- cdsBy(txdb, by = "tx", use.names = TRUE)
cds_tx <- cds_tx["ENST00000275493.7"]

intbytx <- intronsByTranscript(txdb, use.names = TRUE)
intbytx <- intbytx["ENST00000275493.7"]


five_utr <- fiveUTRsByTranscript(txdb, use.names = TRUE)$ENST00000275493.7
three_utr <- threeUTRsByTranscript(txdb, use.names = TRUE)$ENST00000275493.7
gene_model_gr <-
  c(five_utr, cds_tx$ENST00000275493.7, intbytx$ENST00000275493.7, three_utr)
gene_model_gr <- sort(gene_model_gr)
id_col <- mcols(gene_model_gr)$cds_id
id_col[1] <- "5utr_247279"
id_col[57] <- "3utr_247330"
id_col[seq(2, 56, 2)] <- paste0("cds", 1:28, "_", id_col[seq(2, 56, 2)])
id_col[seq(3, 55, 2)] <- paste0("intron", 1:27)
mcols(gene_model_gr) <- DataFrame(id = id_col)
cds_seqs <- extractTranscriptSeqs(hg_genome, as(gene_model_gr, "GRangesList"))
mcols(gene_model_gr)$seq <- cds_seqs
gene_model_gr
GRanges object with 57 ranges and 2 metadata columns:
       seqnames            ranges strand |           id                     seq
          <Rle>         <IRanges>  <Rle> |  <character>          <DNAStringSet>
   [1]     chr7 55019017-55019277      + |  5utr_247279 AGACGTCCGG...GGGAGCAGCG
   [2]     chr7 55019278-55019365      + |  cds1_101519 ATGCGACCCT...GAAAAGAAAG
   [3]     chr7 55019366-55142285      + |      intron1 GTAAGGGCGT...TTTCTTCCAG
   [4]     chr7 55142286-55142437      + |  cds2_101520 TTTGCCAAGG...CTTCTTAAAG
   [5]     chr7 55142438-55143304      + |      intron2 GTTGGTGACT...CTCTTCTTAG
   ...      ...               ...    ... .          ...                     ...
  [53]     chr7 55201783-55202516      + |     intron26 GTATGTATGA...CCTCCTGCAG
  [54]     chr7 55202517-55202625      + | cds27_101551 CTGCAAAGCT...CCAGTGCCTG
  [55]     chr7 55202626-55205255      + |     intron27 GTGAGTGGCT...CCACTTTCAG
  [56]     chr7 55205256-55205617      + | cds28_101553 AATACATAAA...TGGAGCATGA
  [57]     chr7 55205618-55211628      + |  3utr_247330 CCACGGAGGA...TTGTTAACAA
  -------
  seqinfo: 1 sequence from hg38 genome
loc_lens <-
  tibble(loc = mcols(gene_model_gr)$id, loc_len = width(gene_model_gr))
loc_lens
EGFR_seq <- extractTranscriptSeqs(hg_genome, cds_tx)
EGFR_seq
DNAStringSet object of length 1:
    width seq                                                                                                                                                                                              names               
[1]  3633 ATGCGACCCTCCGGGACGGCCGGGGCAGCGCTCCTGGCGCTGCTGGCTGCGCTCTGCCCGGCGAGTCGGGCTCTGGAGGAAAAGAAAGTTTGCCA...AGCCAAGCCAAATGGCATCTTTAAGGGCTCCACAGCTGAAAATGCAGAATACCTAAGGGTCGCGCCACAAAGCAGTGAATTTATTGGAGCATGA ENST00000275493.7
EGFR_aa <- translate(EGFR_seq)
EGFR_aa
AAStringSet object of length 1:
    width seq                                                                                                                                                                                              names               
[1]  1211 MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYVQRNYDLSFLKTIQEVAGYVLIALNT...APSRDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGSTAENAEYLRVAPQSSEFIGA* ENST00000275493.7
cds_starts <- start(cds_tx)
cds_ends <- end(cds_tx)
cds_width <-
  transcriptWidths(
    exonStarts = cds_starts,
    exonEnds = cds_ends
  )
ref_locs <- transcriptLocs2refLocs(list(c(1:cds_width)),
  exonStarts = cds_starts,
  exonEnds = cds_ends,
  strand = c("+")
)[[1]]
ref_locs[1:10]
 [1] 55019278 55019279 55019280 55019281 55019282 55019283 55019284 55019285 55019286 55019287

Load, combine, and filter variant data

Load clinvar data

clinvar_hg38_EGFR <-
  read_tsv(clinvar_EGFR_path)
Rows: 2444 Columns: 34── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (23): Type, Name, GeneSymbol, HGNC_ID, ClinicalSignificance, LastEvaluated, nsv/esv (dbVar), RCVaccession, PhenotypeIDS, PhenotypeList, Origin, OriginSimple, Assembly, ChromosomeAccession, ReferenceAllele, Alternate...
dbl (11): #AlleleID, GeneID, ClinSigSimple, RS# (dbSNP), Chromosome, Start, Stop, NumberSubmitters, SubmitterCategories, VariationID, PositionVCF
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinvar_hg38_EGFR <-
  clinvar_hg38_EGFR %>%
  dplyr::select(
    -GeneID, -GeneSymbol, -HGNC_ID, -`nsv/esv (dbVar)`,
    -Assembly, -ChromosomeAccession, -ReferenceAllele,
    -AlternateAllele, -Cytogenetic
  ) %>%
  mutate(var_len = pmax(
    str_length(ReferenceAlleleVCF),
    str_length(AlternateAlleleVCF)
  )) %>%
  filter(var_len <= 10) %>%
  arrange(PositionVCF)

# only keep main transcript variants
clinvar_hg38_EGFR <-
  clinvar_hg38_EGFR %>%
  filter(Name != "NM_001346941.2(EGFR):c.89-4536_89-4529del")
clinvar_hg38_EGFR
clinvar_hg38_EGFR_renamed <-
  clinvar_hg38_EGFR %>%
  dplyr::select(
    Name, Start, Stop,
    ReferenceAlleleVCF,
    AlternateAlleleVCF,
    VariationID
  ) %>%
  set_names(c(
    "name", "start", "stop",
    "ref", "alt", "clinvar_id"
  )) %>%
  mutate(name = str_replace(
    name, "NM_005228.5\\(EGFR\\):",
    paste0("g", start, "_")
  )) %>%
  mutate(clinvar_id = as.character(clinvar_id))
clinvar_hg38_EGFR_renamed
clinvar_hg38_EGFR_not_snv <-
  clinvar_hg38_EGFR_renamed %>%
  filter(!(str_length(ref) == 1 &
    str_length(alt) == 1)) %>%
  mutate(type = case_when(
    (str_length(ref) == str_length(alt) &
      str_length(ref) == 1) ~ "snv",
    (str_sub(ref, 1, 1) == alt &
      str_length(alt) < str_length(ref)) ~ "del",
    (str_sub(alt, 1, 1) == ref &
      str_length(ref) == 1 &
      str_length(alt) > 1) ~ "ins",
    TRUE ~ "indel"
  )) %>%
  mutate(stop = if_else(type == "ins", stop - 1, stop)) %>%
  mutate(
    alt = if_else(type == "del", "", alt),
    ref = if_else(type == "del", str_sub(ref, 2), ref)
  ) %>%
  rename(clinvar_id = "id") %>%
  relocate(type, .after = "name")
clinvar_hg38_EGFR_not_snv
clinvar_hg38_EGFR_snv <-
  clinvar_hg38_EGFR_renamed %>%
  filter((str_length(ref) == 1 & str_length(alt) == 1))
clinvar_hg38_EGFR_snv

Load COSMIC data

cosmic_dt <-
  read_tsv(cosmic_path)
Rows: 15654 Columns: 26── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (20): GENE_SYMBOL, COSMIC_GENE_ID, TRANSCRIPT_ACCESSION, COSMIC_SAMPLE_ID, SAMPLE_NAME, COSMIC_PHENOTYPE_ID, GENOMIC_MUTATION_ID, LEGACY_MUTATION_ID, MUTATION_CDS, MUTATION_AA, MUTATION_DESCRIPTION, MUTATION_ZYGOSIT...
dbl  (5): MUTATION_ID, CHROMOSOME, GENOME_START, GENOME_STOP, PUBMED_PMID
lgl  (1): LOH
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cosmic_dt <-
  cosmic_dt %>%
  filter(TRANSCRIPT_ACCESSION == "ENST00000275493.6") %>%
  group_by(GENOMIC_MUTATION_ID) %>%
  dplyr::slice(1) %>%
  ungroup()

cosmic_dt_subset <-
  cosmic_dt %>%
  dplyr::select(
    MUTATION_CDS, MUTATION_AA,
    GENOME_START, GENOME_STOP,
    GENOMIC_WT_ALLELE, GENOMIC_MUT_ALLELE,
    GENOMIC_MUTATION_ID, MUTATION_DESCRIPTION
  ) %>%
  mutate(MUTATION_AA = str_replace(MUTATION_AA, "p.\\?", "")) %>%
  mutate(name = if_else(MUTATION_AA == "",
    paste0("g", GENOME_START, "_", MUTATION_CDS),
    paste0("g", GENOME_START, "_", MUTATION_CDS, " (", MUTATION_AA, ")")
  )) %>%
  dplyr::select(
    name, GENOME_START, GENOME_STOP,
    GENOMIC_WT_ALLELE, GENOMIC_MUT_ALLELE,
    GENOMIC_MUTATION_ID
  ) %>%
  set_names(c("name", "start", "stop", "ref", "alt", "cosmic_id")) %>%
  mutate(
    ref = if_else(is.na(ref), "", ref),
    alt = if_else(is.na(alt), "", alt)
  ) %>%
  mutate(var_len = pmax(str_length(ref), str_length(alt))) %>%
  filter(var_len <= 10) %>%
  dplyr::select(-var_len) %>%
  arrange(start)



cosmic_dt_subset_snv <-
  cosmic_dt_subset %>%
  filter(str_length(ref) == 1 & str_length(alt) == 1)

cosmic_dt_subset_not_snv <-
  cosmic_dt_subset %>%
  filter(!(str_length(ref) == 1 & str_length(alt) == 1))

cosmic_dt_subset_snv
cosmic_dt_subset_not_snv <-
  cosmic_dt_subset_not_snv %>%
  mutate(type = case_when(
    (str_length(ref) == str_length(alt) & str_length(ref) == 1) ~ "snv",
    (str_length(ref) >= 1 & str_length(alt) == 0) ~ "del",
    (str_length(ref) == 0 & str_length(alt) >= 1) ~ "ins",
    TRUE ~ "indel"
  )) %>%
  relocate(type, .after = name)


cosmic_dt_subset_not_snv <-
  cosmic_dt_subset_not_snv %>%
  mutate(extra_base = map_chr(
    cosmic_dt_subset_not_snv$start,
    ~ as.character(subseq(hg_genome[["chr7"]], ., .))
  )) %>%
  mutate(
    stop = if_else(type == "ins", stop - 1, stop),
    ref = if_else(type == "ins", extra_base, ref),
    alt = if_else(type == "ins", paste0(extra_base, alt), alt)
  ) %>%
  dplyr::select(-extra_base)
cosmic_dt_subset_not_snv
snv_dt <-
  full_join(cosmic_dt_subset_snv,
    clinvar_hg38_EGFR_snv,
    by = c("start", "stop", "ref", "alt")
  ) %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  dplyr::select(
    name, start, stop,
    ref, alt, cosmic_id,
    clinvar_id
  ) %>%
  arrange(start) %>%
  mutate(name = gsub("_c\\.\\d+[+-]?\\d+([[:alpha:]])>", "_\\1>", name))
snv_dt

Load variants from base editor experiments

ABE8e_all_dt <- read_csv(ABE8e_path)
New names:Rows: 834 Columns: 29── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (21): sgRNA.sequence, sgRNA.context.sequence, Gene.Symbol, Ensembl.Gene.ID, Ensembl.transcript.ID, Genome.assembly, Transcript.reference.allele, Transcript.alternate.allele, Genome.reference.allele, Genome.alternate...
dbl  (8): ...1, Gene.strand, Chromosome, sgrna.genomic.position, X..edits, X.silent.edits, Amino.acid.position_simplified, guide_codon_pos
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
BE4max_all_dt <- read_csv(BE4max_path)
New names:Rows: 834 Columns: 29── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (21): sgRNA.sequence, sgRNA.context.sequence, Gene.Symbol, Ensembl.Gene.ID, Ensembl.transcript.ID, Genome.assembly, Transcript.reference.allele, Transcript.alternate.allele, Genome.reference.allele, Genome.alternate...
dbl  (8): ...1, Gene.strand, Chromosome, sgrna.genomic.position, X..edits, X.silent.edits, Amino.acid.position_simplified, guide_codon_pos
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ABE8e_dt <- ABE8e_all_dt %>%
  dplyr::select(c(14, 15, 17, 20, 27))
BE4max_dt <- BE4max_all_dt %>%
  dplyr::select(c(14, 15, 17, 20, 27))
base_editor_variants <-
  bind_rows(BE4max_dt, ABE8e_dt) %>%
  dplyr::filter(Mutation.category_simplified %in%
    c("Nonsense", "Missense", "Splice-acceptor", "Splice-donor"))
base_editor_variants <-
  base_editor_variants %>%
  separate_rows(Nucleotide.edits, sep = "C_") %>%
  separate_rows(Nucleotide.edits, sep = "A_") %>%
  separate_rows(Nucleotide.edits, sep = "_") %>%
  separate_rows(Nucleotide.edits, sep = ";") %>%
  dplyr::filter(Nucleotide.edits != "") %>%
  mutate(Nucleotide.edits = as.integer(Nucleotide.edits)) %>%
  mutate(start = if_else(sgRNA.Strand == "sense",
    sgrna.genomic.position + (Nucleotide.edits - 1),
    sgrna.genomic.position - (Nucleotide.edits - 1)
  )) %>%
  mutate(
    edit =
      case_when(
        Edit == "C-T" & sgRNA.Strand == "antisense" ~ "G-A",
        Edit == "A-G" & sgRNA.Strand == "antisense" ~ "T-C",
        TRUE ~ Edit
      )
  ) %>%
  dplyr::select(start, edit) %>%
  group_by(start) %>%
  dplyr::slice(1) %>%
  tidyr::separate(edit,
    into = c("ref", "alt"), sep = "-"
  ) %>%
  mutate(
    stop = start,
    name = paste0("g", start, "_", ref, ">", alt)
  ) %>%
  dplyr::select(c("name", "start", "stop", "ref", "alt"))
base_editor_variants

Combine variants data

snv_dt <-
  full_join(snv_dt,
    base_editor_variants,
    by = c("start", "stop", "ref", "alt")
  )

snv_dt <-
  snv_dt %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  rename(name.y = "be_id") %>%
  dplyr::select(
    name, start, stop, ref,
    alt, cosmic_id, clinvar_id, be_id
  ) %>%
  unite("id", cosmic_id:be_id, sep = ";", na.rm = TRUE) %>%
  arrange(start) %>%
  mutate(type = "snv")
snv_dt
non_snv_dt <-
  full_join(clinvar_hg38_EGFR_not_snv,
    cosmic_dt_subset_not_snv,
    by = c("start", "stop", "ref", "alt", "type")
  ) %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  dplyr::select(
    name, type, start, stop,
    ref, alt, cosmic_id,
    id
  ) %>%
  arrange(start) %>%
  unite("id", cosmic_id:id, sep = ";", na.rm = TRUE)
non_snv_dt
variants_dt <-
  bind_rows(
    snv_dt,
    non_snv_dt
  ) %>%
  arrange(start) %>%
  mutate(name = sub(" \\(p\\..*$", "", name))

variants_dt

Annotate Variants

annotate_and_predict_variants <- function(variants_dt) {
  seq_len_min <- 49

  num_variants <- NROW(variants_dt)
  gr0 <-
    GRanges(
      Rle(c("chr7"), num_variants),
      IRanges(
        start = variants_dt$start,
        end = variants_dt$stop,
        names = variants_dt$name
      )
    )

  var_allelles <- DNAStringSet(variants_dt$alt)

  intron_locations <-
    locateVariants(gr0,
      intbytx,
      IntronVariants(),
      varAllele = var_allelles
    )
  # Locate intronic variants
  intron_locations <-
    intron_locations %>%
    as_tibble() %>%
    add_column(name = names(intron_locations)) %>%
    relocate(name, .before = "seqnames") %>%
    dplyr::select(-PRECEDEID, -FOLLOWID, -TXID)

  # Locate remaining variants
  all <- locateVariants(gr0,
    txdb,
    AllVariants(),
    varAllele = var_allelles
  )
  all <-
    all %>%
    as_tibble() %>%
    add_column(name = names(all)) %>%
    relocate(name, .before = "seqnames") %>%
    dplyr::select(-PRECEDEID, -FOLLOWID)

  # Flag intronic and 5UTR variants that have alternative 
  # effects given other potential spliced transcripts

  transcript_splice_sites <-
    all %>%
    dplyr::filter(LOCATION == "spliceSite" & TXID == "93502")

  intron_locations <-
    intron_locations %>%
    filter(!(name %in% transcript_splice_sites$name))

  uncertain_introns <-
    all %>%
    dplyr::filter(name %in% intron_locations$name) %>%
    group_by(name, LOCATION) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    group_by(name) %>%
    mutate(frac_intron = sum(LOCATION == "intron") / n()) %>%
    dplyr::filter(frac_intron < 1) %>%
    dplyr::filter(LOCATION != "intron") %>%
    dplyr::filter(TXID != 93502) %>%
    arrange(QUERYID) %>%
    dplyr::select(-GENEID, -frac_intron) %>%
    ungroup()

  flag_variants <-
    uncertain_introns %>%
    dplyr::select(name, LOCATION, TXID) %>%
    left_join(., tx_lengths %>%
      dplyr::select(tx_id, tx_name) %>%
      mutate(tx_id = as.character(tx_id)) %>%
      dplyr::rename(TXID = "tx_id"),
    by = "TXID"
    ) %>%
    dplyr::select(-TXID) %>%
    unite("alt_effect", LOCATION:tx_name)


  all <-
    all %>%
    dplyr::filter(TXID == 93502) %>%
    dplyr::select(-TXID)

  potential_promoters <-
    all[which(duplicated(all$name)), ] %>%
    dplyr::select(name) %>%
    mutate(alt_effect = "promoter_ENST00000275493.7")

  flag_variants <-
    bind_rows(flag_variants, potential_promoters)

  flag_variants <-
    flag_variants %>%
    group_by(name) %>%
    dplyr::slice(1) %>%
    ungroup()


  all <-
    all %>%
    dplyr::filter(!duplicated(all$name))


  all <-
    bind_rows(
      all,
      intron_locations
    )

  all <-
    left_join(all, flag_variants, by = "name") %>%
    dplyr::select(-GENEID)


  all <-
    all %>%
    dplyr::select(-c(LOCSTART:CDSID))

  gene_model_gr_list <-
    as(gene_model_gr, "GRangesList")

  names(gene_model_gr_list) <- mcols(gene_model_gr)$id

  map_grs <-
    mapToTranscripts(resize(gr0, 1), gene_model_gr_list)

  map_dt <-
    map_grs %>%
    as_tibble() %>%
    mutate(name = names(map_grs)) %>%
    dplyr::select(name, seqnames, start) %>%
    set_names(c("name", "loc", "loc_start")) %>%
    left_join(., loc_lens, by = "loc")

  all <-
    left_join(all,
      map_dt,
      by = "name"
    )

  # Predict consequence of variant
  predict_coding_variants <-
    predictCoding(gr0, txdb, hg_genome, varAllele = var_allelles)
  predict_coding_variants <-
    predict_coding_variants[mcols(predict_coding_variants)$TXID == 93502]
  predict_coding_variants <-
    as_tibble(predict_coding_variants) %>%
    add_column(name = names(predict_coding_variants)) %>%
    dplyr::select(
      name, CDSLOC.start, CONSEQUENCE, REFCODON,
      VARCODON, REFAA, VARAA, PROTEINLOC
    )

  n_distinct(all$name)

  all <-
    left_join(all, predict_coding_variants, by = "name")



  all <-
    all %>%
    mutate(loc_end_comp = loc_len - (loc_start + width - 1)) %>%
    mutate(boundary_flag = (LOCATION == "coding" &
      (loc_start < seq_len_min | loc_end_comp < seq_len_min)))

  all <-
    all %>%
    dplyr::select(
      name, CONSEQUENCE, LOCATION, loc, loc_len, loc_start,
      CDSLOC.start, PROTEINLOC, REFCODON, VARCODON,
      REFAA, VARAA, boundary_flag, alt_effect
    ) %>%
    set_names(c(
      "name", "consequence", "location", "region", "region_len",
      "pos_region", "pos_cds", "pos_protein", "ref_codon",
      "var_codon", "ref_aa", "var_aa", "boundary_flag", "alt_effect"
    )) %>%
    rowwise() %>%
    mutate(pos_protein = str_c(pos_protein, collapse = ":"))

  return(all)
}
all <-
  annotate_and_predict_variants(variants_dt)
Warning: GRanges object contains 20914 out-of-bound ranges located on sequences 93501, 93502, 93503, 93504, 93505, 93509, and 93510. Note that ranges located on a sequence whose length is unknown (NA) or on a
  circular sequence are not considered out-of-bound (use seqlengths() and isCircular() to get the lengths and circularity flags of the underlying sequences). You can use trim() to trim these ranges.
  See ?`trim,GenomicRanges-method` for more information.'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
Warning: GRanges object contains 20914 out-of-bound ranges located on sequences 93501, 93502, 93503, 93504, 93505, 93509, and 93510. Note that ranges located on a sequence whose length is unknown (NA) or on a
  circular sequence are not considered out-of-bound (use seqlengths() and isCircular() to get the lengths and circularity flags of the underlying sequences). You can use trim() to trim these ranges.
  See ?`trim,GenomicRanges-method` for more information.Warning: records with missing 'varAllele' were ignoredWarning: in 'x[[12817]]': last 2 bases were ignoredWarning: in 'x[[12817]]': last 2 bases were ignored

Include base editing variants generated by two or more edits

min_base_changes <- function(target_amino_acid, ref_codon) {
  # Define a mapping of amino acids to codons
  amino_acid_mapping <- list(
    "A" = c("GCT", "GCC", "GCA", "GCG"),
    "R" = c("CGT", "CGC", "CGA", "CGG", "AGA", "AGG"),
    "N" = c("AAT", "AAC"),
    "D" = c("GAT", "GAC"),
    "C" = c("TGT", "TGC"),
    "Q" = c("CAA", "CAG"),
    "E" = c("GAA", "GAG"),
    "G" = c("GGT", "GGC", "GGA", "GGG"),
    "H" = c("CAT", "CAC"),
    "I" = c("ATA"), # "ATT", "ATC", two A>Ts
    "L" = c("TTA", "TTG", "CTT", "CTC", "CTA", "CTG"),
    "K" = c("AAA", "AAG"),
    "M" = c("ATG"),
    "F" = c("TTT", "TTC"),
    "P" = c("CCT", "CCC", "CCA", "CCG"),
    "S" = c("TCT", "TCC", "TCA", "TCG", "AGT", "AGC"),
    "T" = c("ACT", "ACC", "ACA", "ACG"),
    "W" = c("TGG"),
    "Y" = c("TAT", "TAC"),
    "V" = c("GTT", "GTC", "GTA", "GTG"),
    "*" = c("TAA", "TAG", "TGA")
  )

  # Find the reference amino acid corresponding to the reference codon
  ref_amino_acid <-
    names(amino_acid_mapping)[which(ref_codon %in% unlist(amino_acid_mapping))]

  # Check if the reference amino acid is valid
  if (is.null(ref_amino_acid)) {
    stop("Invalid reference codon.")
  }

  # Check if the target amino acid is valid
  if (!(target_amino_acid %in% names(amino_acid_mapping))) {
    stop("Invalid target amino acid.")
  }

  # Find the target codons for the given amino acid
  target_codons <- amino_acid_mapping[[target_amino_acid]]

  # Calculate the number of base changes needed for each target codon
  base_changes <-
    sapply(
      target_codons,
      function(tc) sum(strsplit(ref_codon, "")[[1]] != strsplit(tc, "")[[1]])
    )

  # Find the index of the minimum base changes
  min_index <-
    which.min(base_changes)

  # Return the variant codon with minimum base changes
  return(target_codons[min_index])
}

get_codon_at_aa_position <- function(pos) {
  start_pos <- pos * 3 - 2
  end_pos <- pos * 3
  as.character(subseq(EGFR_seq, start_pos, end_pos))
}


ABE8e_aa_var <-
  ABE8e_all_dt %>%
  dplyr::select(Amino.acid.edits) %>%
  separate_rows(Amino.acid.edits, sep = ";") %>%
  dplyr::filter(!(Amino.acid.edits %in% c("utr", "None", ""))) %>%
  dplyr::filter(!str_detect(Amino.acid.edits, "Exon")) %>%
  separate_wider_regex(Amino.acid.edits,
    c(
      ref_aa = "^[A-Za-z]+",
      pos_protein = "[0-9]+",
      var_aa = "[A-Za-z]+$"
    ),
    cols_remove = F
  ) %>%
  dplyr::filter(ref_aa != var_aa)


BE4max_aa_var <-
  BE4max_all_dt %>%
  dplyr::select(Amino.acid.edits) %>%
  separate_rows(Amino.acid.edits, sep = ";") %>%
  dplyr::filter(!(Amino.acid.edits %in% c("utr", "None", ""))) %>%
  dplyr::filter(!str_detect(Amino.acid.edits, "Exon")) %>%
  separate_wider_regex(Amino.acid.edits,
    c(
      ref_aa = "^[A-Za-z]+",
      pos_protein = "[0-9]+",
      var_aa = "[A-Za-z]+$"
    ),
    cols_remove = F
  ) %>%
  dplyr::filter(ref_aa != var_aa)

aa_map <- c(AMINO_ACID_CODE, "*" = "Ter")
flipped_aa_map <- setNames(names(aa_map), aa_map)

be_var_dt <-
  bind_rows(ABE8e_aa_var, BE4max_aa_var) %>%
  group_by(Amino.acid.edits) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  mutate(
    ref_aa = flipped_aa_map[ref_aa],
    var_aa = flipped_aa_map[var_aa]
  )

variants_dt0 <-
  left_join(variants_dt, all, by = "name")

be_var_dt <-
  left_join(be_var_dt,
    variants_dt0 %>%
      filter(pos_protein != "" & type == "snv") %>%
      dplyr::select(name, ref_aa, pos_protein, var_aa),
    by = c("ref_aa", "pos_protein", "var_aa")
  ) %>%
  dplyr::filter(is.na(name)) %>%
  mutate(pos_protein = as.numeric(pos_protein)) %>%
  rowwise() %>%
  mutate(ref_codon = get_codon_at_aa_position(pos_protein))

be_var_dt <-
  be_var_dt %>%
  mutate(var_codon = min_base_changes(var_aa, ref_codon))

be_var_dt <-
  be_var_dt %>%
  mutate(
    last_base_diff =
      str_sub(ref_codon, 3, 3) != str_sub(var_codon, 3, 3)
  ) %>%
  mutate(
    ref = if_else(last_base_diff, ref_codon, str_sub(ref_codon, 1, 2)),
    alt = if_else(last_base_diff, var_codon, str_sub(var_codon, 1, 2)),
    start = ref_locs[pos_protein * 3 - 2],
    stop = if_else(last_base_diff, start + 2, start + 1)
  ) %>%
  mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
  mutate(
    id = name,
    type = "indel"
  ) %>%
  dplyr::select(c("name", "type", "start", "stop", "ref", "alt", "id"))
rm(variants_dt0)

# remove g55174776_TT>CC since it is the same as g55174776_c.2239_2240delinsCC
be_var_dt <-
  be_var_dt %>%
  filter(name != "g55174776_TT>CC")
be_var_dt

Include double base edits variants

variants_dt <-
  bind_rows(variants_dt, be_var_dt) %>%
  arrange(start)

Reannotate after inclusion of double base edits variants

all <-
  annotate_and_predict_variants(variants_dt)
Warning: GRanges object contains 21739 out-of-bound ranges located on sequences 93501, 93502, 93503, 93504, 93505, 93509, and 93510. Note that ranges located on a sequence whose length is unknown (NA) or on a
  circular sequence are not considered out-of-bound (use seqlengths() and isCircular() to get the lengths and circularity flags of the underlying sequences). You can use trim() to trim these ranges.
  See ?`trim,GenomicRanges-method` for more information.'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
'select()' returned many:1 mapping between keys and columns
Warning: GRanges object contains 21739 out-of-bound ranges located on sequences 93501, 93502, 93503, 93504, 93505, 93509, and 93510. Note that ranges located on a sequence whose length is unknown (NA) or on a
  circular sequence are not considered out-of-bound (use seqlengths() and isCircular() to get the lengths and circularity flags of the underlying sequences). You can use trim() to trim these ranges.
  See ?`trim,GenomicRanges-method` for more information.Warning: records with missing 'varAllele' were ignoredWarning: in 'x[[13259]]': last 2 bases were ignoredWarning: in 'x[[13259]]': last 2 bases were ignored
variants_dt <-
  left_join(variants_dt, all, by = "name")

variants_dt
table(variants_dt$location)

spliceSite     intron    fiveUTR   threeUTR     coding intergenic   promoter 
        65       1825          5         61       3060          0          0 
table(variants_dt$location, variants_dt$type)
            
              del indel  ins  snv
  spliceSite    1     0    2   62
  intron       66     3   38 1718
  fiveUTR       0     0    0    5
  threeUTR      5     0    3   53
  coding       27   127   35 2871
  intergenic    0     0    0    0
  promoter      0     0    0    0

Filter out introns

variants_dt <-
  variants_dt %>%
  filter(location != c("intron"))
variants_dt

Generate synonymous mutations near target Single Nucleotide Variants

# Function to split a string into consecutive triplets
split_into_triplets <- function(s) {
  n <- nchar(s)
  if (n %% 3 != 0) {
    warning("String length is not a multiple of 3. Padding with spaces.")
    s <- paste0(s, rep(" ", 3 - (n %% 3)))
  }
  matrix(strsplit(s, "")[[1]], ncol = 3, byrow = TRUE)
}

# Split the vector into triplets
triplets <- split_into_triplets(as.character(EGFR_seq))

# Create a dataframe
codon_map_df <-
  tibble(ref_codon = apply(triplets, 1, paste, collapse = "")) %>%
  mutate(pos_protein = row_number())
codon_map_df
syn_codon_map <-
  read_delim(syn_codon_path)
Rows: 59 Columns: 5── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: " "
chr (5): ref_aa, ref_codon, var_codon, ref, alt
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
syn_codon_map
codon_map_df <-
  left_join(codon_map_df, syn_codon_map, by = "ref_codon") %>%
  mutate(start = ref_locs[pos_protein * 3]) %>%
  filter(!is.na(ref_aa))
codon_map_df

Split SNVS into spliceSite coding, and intron variants

variants_snv <-
  variants_dt %>%
  filter(type == "snv")

variants_snv_coding <-
  variants_snv %>%
  filter(location %in% c("coding"))

Find nonsynomous mutations

get_codon_map <- function(start, name, num_idx) {
  dis <- abs(start - codon_map_df$start)
  dis[dis <= 2] <- 10000
  lowest_indices <- head(order(dis), num_idx)

  bp_dist <- codon_map_df$start[lowest_indices] - start

  dt <- tibble(name = name, dist = bp_dist)
  dt <- bind_cols(dt, codon_map_df[lowest_indices, ])
  return(dt)
}

coding_syn <-
  map2_df(variants_snv_coding$start,
    variants_snv_coding$name,
    get_codon_map,
    num_idx = 1
  ) %>%
  rename(name = "target_name") %>%
  mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
  dplyr::select(
    name, start, ref, alt, ref,
    target_name, pos_protein,
    ref_codon, var_codon, ref_aa, dist
  )
table(coding_syn$dist)

 -8  -7  -6  -5  -4  -3   3   4   5   6   7   8 
  1   4   4  54 910 955  38 845  53   1   4   2 
coding_syn <-
  coding_syn %>%
  dplyr::select(-dist) %>%
  group_by(name) %>%
  mutate(target_name = paste(target_name, collapse = ";")) %>%
  dplyr::slice(1)
coding_syn
## Previous code for including intronic variants!!

# variants_snv_intron <-
#   variants_snv %>%
#   filter(location %in% c("intron", "spliceSite"))

# mut_pos_dist <- 3
# intron_syn <-
#   variants_snv_intron %>%
#   dplyr::select(name, start, region_len, pos_region) %>%
#   mutate(pos_region_comp = region_len - pos_region) %>%
#   mutate(upstream = case_when(
#     pos_region_comp < 8 ~ F,
#     pos_region_comp >= 8 ~ T
#   )) %>%
#   mutate(
#     start_syn = if_else(upstream, start + mut_pos_dist, start - mut_pos_dist),
#     ref = map_chr(
#       start_syn,
#       ~ as.character(subseq(hg_genome[["chr7"]], ., .))
#     )
#   )
#

# dup_alternatives <-
#   intron_syn %>%
#   filter(start %in% intron_syn$start_syn) %>%
#   separate(name, into = c("edit", "alt0"), remove = F, sep = ">") %>%
#   separate(edit, into = c("loc", "ref0"), remove = F, sep = "_") %>%
#   dplyr::select(name, start, ref0, alt0) %>%
#   group_by(start) %>%
#   mutate(bases = paste(alt0, collapse = ",")) %>%
#   ungroup() %>%
#   mutate(bases = paste0(bases, ",", ref0)) %>%
#   rowwise() %>%
#   mutate(alt_2 = setdiff(c("A", "C", "T", "G"),
#                          str_split(bases, pattern = ",")[[1]])[1]) %>%
#   mutate(name_sym = paste0("g", start, "_", ref0, ">", alt_2)) %>%
#   dplyr::select(name, name_sym, alt_2)
# dup_alternatives

# intron_syn <-
#   intron_syn %>%
#   mutate(alt = case_when(
#     ref == "G" ~ "A",
#     ref == "A" ~ "G",
#     ref == "C" ~ "T",
#     ref == "T" ~ "C"
#   )) %>%
#   dplyr::select(name, start_syn, ref, alt) %>%
#   dplyr::rename(target_name = "name", start = "start_syn") %>%
#   mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
#   dplyr::select(name, start, ref, alt, target_name) %>%
#   group_by(name) %>%
#   mutate(target_name = paste(target_name, collapse = ";")) %>%
#   dplyr::slice(1)


# intron_syn <-
#   left_join(intron_syn,
#     dup_alternatives,
#     by = "name"
#   ) %>%
#   mutate(
#     name = if_else(is.na(name_sym), name, name_sym),
#     alt = if_else(is.na(alt_2), alt, alt_2)
#   ) %>%
#   dplyr::select(-name_sym, -alt_2)

# syn_variants <-
#   bind_rows(
#     coding_syn %>%
#       mutate(location = "coding"),
#     intron_syn %>%
#       mutate(
#         pos_protein = NA,
#         ref_codon = NA,
#         var_codon = NA,
#         ref_aa = NA,
#         location = "intron"
#       )
#   )
# syn_variants
syn_variants <-
  left_join(
    coding_syn %>%
      mutate(location = "coding"),
    variants_snv %>%
      dplyr::select(name, id),
    by = "name"
  ) %>%
  mutate(
    syn_in_target = name %in% variants_dt$name,
    num_targets = str_count(target_name, ";") + 1
  )
syn_variants
syn_target_map <-
  syn_variants %>%
  group_by(num_targets, syn_in_target) %>%
  tally()
syn_target_map
syn_variants_long <-
  syn_variants %>%
  separate_rows(target_name, sep = ";") %>%
  dplyr::select(
    target_name, name, start, ref,
    alt
  ) %>%
  set_names(c(
    "name", "name_syn", "start_syn",
    "ref_syn", "alt_syn"
  ))
syn_variants_long
variants_dt <-
  left_join(variants_dt, syn_variants_long, by = "name")
variants_dt
variants_dt <-
  variants_dt %>%
  mutate(target_in_syn = if_else(name %in% syn_variants$name, T, F)) %>%
  mutate(name_target_syn = paste0(name, ":", name_syn)) %>%
  mutate(
    clinvar = if_else(grepl("\\b\\d+\\b", id), T, F),
    cosmic = if_else(grepl("COSV", id), T, F),
    base_edit = if_else(grepl("g\\d+_[A-Z]+>[A-Z]+", id), T, F)
  ) %>%
  dplyr::select(
    name, id, ref, alt, ref_aa, var_aa, pos_protein,
    type, consequence, location, name_syn, name_target_syn,
    target_in_syn, alt_effect,
    clinvar, cosmic, base_edit, start, stop, start_syn, everything()
  )  %>%
  mutate(name_target_syn=if_else(is.na(name_syn), NA, name_target_syn)) 
variants_dt

Split variants into target, target_syn, and syn

variants_snv <-
  variants_dt %>%
  filter(!is.na(name_syn))  


target_syn_variants <-
  variants_snv  %>%
  dplyr::select(name_target_syn, name_syn, name) %>%
  set_names(c("target_name","name_syn", "name_target"))  %>%
  mutate( name_target_syn=NA,
          num_targets=NA,
          paired_targets_as_syn= NA, 
          paired_targets_syn_as_syn= NA) %>%
  dplyr::select(target_name, name_target, name_syn, 
                name_target_syn,
                num_targets, paired_targets_as_syn,
                paired_targets_syn_as_syn )
target_syn_variants
NA
syn_variants_dt <-
  variants_snv %>%
  dplyr::select(name_syn, name, name_target_syn) %>%
  set_names(c("target_name", "name_target", "name_target_syn" ))  %>%

  group_by(target_name)%>% 
  mutate(name_target = paste(name_target, collapse = ";"),
         name_target_syn = paste(name_target_syn, collapse = ";")) %>% 
  dplyr::slice(1)    %>% 
  mutate( 
          syn_in_target = target_name %in% variants_dt$name,
          num_targets = str_count(name_target, ";") + 1
  )


syn_variants_dt_filtered <-
  syn_variants_dt %>%
  mutate(target_in_syn=NA,
        name_syn=NA,
        paired_targets_as_syn= NA, 
        paired_targets_syn_as_syn= NA) %>%
  filter(!(syn_in_target))%>%
  dplyr::select(target_name, name_target, name_syn,
                name_target_syn,
                num_targets, 
                paired_targets_as_syn, 
                paired_targets_syn_as_syn )

syn_variants_in_target <-
    syn_variants_dt %>%
  filter((syn_in_target)) %>%
  dplyr::select(-syn_in_target) %>%
  rename(name_target="paired_targets_as_syn",
         name_target_syn="paired_targets_syn_as_syn")
syn_variants_dt_filtered
variants_dt <-
  left_join(variants_dt, 
          syn_variants_in_target %>%
  rename(target_name="name"),
  by="name")
variants_dt
target_variants <-
  variants_dt %>%
  dplyr::select(name, name_syn, name_target_syn, paired_targets_as_syn, paired_targets_syn_as_syn, num_targets) %>%
  set_names(c("target_name", "name_syn", "name_target_syn", "paired_targets_as_syn", "paired_targets_syn_as_syn", "num_targets"))  %>%
  mutate(name_target=NA)  %>%
  dplyr::select(target_name, name_target, name_syn,
                name_target_syn,
                num_targets, 
                paired_targets_as_syn, 
                paired_targets_syn_as_syn )
  
target_variants
all_variants_paired <- 
  bind_rows(target_variants, 
            target_syn_variants, 
            syn_variants_dt_filtered)
all_variants_paired 

Save to files

write_tsv(
  all_variants_paired,
  all_variants_paired_path
)

write_tsv(
  variants_dt, variants_table_path)
sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Zurich
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] VariantAnnotation_1.48.1                 Rsamtools_2.18.0                         SummarizedExperiment_1.32.0              MatrixGenerics_1.14.0                    matrixStats_1.2.0                       
 [6] org.Hs.eg.db_3.18.0                      BSgenome.Hsapiens.UCSC.hg38_1.4.5        BSgenome_1.70.1                          rtracklayer_1.62.0                       BiocIO_1.12.0                           
[11] Biostrings_2.70.1                        XVector_0.42.0                           TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0 GenomicFeatures_1.54.1                   AnnotationDbi_1.64.1                    
[16] Biobase_2.62.0                           GenomicRanges_1.54.1                     GenomeInfoDb_1.38.5                      IRanges_2.36.0                           S4Vectors_0.40.2                        
[21] BiocGenerics_0.48.1                      lubridate_1.9.3                          forcats_1.0.0                            stringr_1.5.1                            dplyr_1.1.4                             
[26] purrr_1.0.2                              readr_2.1.5                              tidyr_1.3.0                              tibble_3.2.1                             ggplot2_3.4.4                           
[31] tidyverse_2.0.0                         

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0         blob_1.2.4               filelock_1.0.3           bitops_1.0-7             fastmap_1.1.1            RCurl_1.98-1.14          BiocFileCache_2.10.1     GenomicAlignments_1.38.1
 [9] XML_3.99-0.16            digest_0.6.34            timechange_0.2.0         lifecycle_1.0.4          KEGGREST_1.42.0          RSQLite_2.3.4            magrittr_2.0.3           compiler_4.3.2          
[17] rlang_1.1.3              progress_1.2.3           tools_4.3.2              utf8_1.2.4               yaml_2.3.8               knitr_1.45               S4Arrays_1.2.0           prettyunits_1.2.0       
[25] bit_4.0.5                curl_5.2.0               DelayedArray_0.28.0      xml2_1.3.6               abind_1.4-5              BiocParallel_1.36.0      withr_2.5.2              grid_4.3.2              
[33] fansi_1.0.6              colorspace_2.1-0         scales_1.3.0             biomaRt_2.58.0           cli_3.6.2                crayon_1.5.2             generics_0.1.3           rstudioapi_0.15.0       
[41] httr_1.4.7               tzdb_0.4.0               rjson_0.2.21             DBI_1.2.1                cachem_1.0.8             zlibbioc_1.48.0          parallel_4.3.2           restfulr_0.0.15         
[49] vctrs_0.6.5              Matrix_1.6-5             hms_1.1.3                bit64_4.0.5              glue_1.7.0               codetools_0.2-19         stringi_1.8.3            gtable_0.3.4            
[57] munsell_0.5.0            pillar_1.9.0             rappdirs_0.3.3           GenomeInfoDbData_1.2.11  R6_2.5.1                 dbplyr_2.4.0             vroom_1.6.5              lattice_0.22-5          
[65] png_0.1-8                memoise_2.0.1            SparseArray_1.2.3        xfun_0.41                pkgconfig_2.0.3         
---
title: "EGFR Prime Editing Library"
subtitle: "Part 1: Combine Variants and Create Table"
author: 
- name: Rick Farouni
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---



# Load libraries and set filepaths 


```{r}
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg38.knownGene))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(VariantAnnotation))
```


```{r}
db_path <- "./input"
output_dir <- "./output"
# ftp_path <- "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"

clinvar_EGFR_path <-
  file.path(db_path, "variant_summary_GRCh38_EGFR.tsv")

cosmic_path <-
  file.path(db_path, "Cosmic_GenomeScreensMutant_v99_GRCh38_EGFR.tsv")

ABE8e_path <-
  file.path(db_path, "sgrna_designs_ABE8e.csv")
BE4max_path <-
  file.path(db_path, "sgrna_designs_BE4max.csv")
syn_codon_path <-
  file.path(db_path, "syn_codons.txt")

variants_table_path <-
  file.path(output_dir, "variants_table.tsv")

all_variants_paired_path <-  
  file.path(output_dir, "all_variants_paired.tsv")
```

## Extract clinvar data and save to file for later processing

```{r}
# # run once
# clinvar_all_path <-
#   file.path(db_path, "variant_summary.txt.gz")
#
#
# clinvar_all <- read_tsv(gzfile(clinvar_all_path))
#
# clinvar_EGFR <-
#   clinvar_all %>%
#   filter(GeneSymbol == "EGFR" &
#     Assembly == "GRCh38" &
#     !(Type %in% c("Microsatellite", "Inversion")) &
#     Start >= 55019017)
# write_tsv(clinvar_EGFR, clinvar_EGFR_path)
```

# Create Gene model for EGFR-201 (ENST00000275493.7)

```{r}
hg_genome <- BSgenome.Hsapiens.UCSC.hg38
```

```{r}
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
txdb <- keepSeqlevels(txdb, "chr7")
```



```{r}
tx_lengths <-
  transcriptLengths(txdb,
    with.utr5_len = TRUE,
    with.utr3_len = TRUE
  )

tx_lengths <-
  tx_lengths %>%
  dplyr::filter(gene_id == 1956) %>%
  arrange(-nexon)
tx_lengths
```


```{r}
(tx_lengths$tx_len - (tx_lengths$utr5_len + tx_lengths$utr3_len)) / 3
```

### Get the cds parts, the introns, the 3UTR, and the 5UTR and combine


```{r}
cds_tx <- cdsBy(txdb, by = "tx", use.names = TRUE)
cds_tx <- cds_tx["ENST00000275493.7"]

intbytx <- intronsByTranscript(txdb, use.names = TRUE)
intbytx <- intbytx["ENST00000275493.7"]


five_utr <- fiveUTRsByTranscript(txdb, use.names = TRUE)$ENST00000275493.7
three_utr <- threeUTRsByTranscript(txdb, use.names = TRUE)$ENST00000275493.7
gene_model_gr <-
  c(five_utr, cds_tx$ENST00000275493.7, intbytx$ENST00000275493.7, three_utr)
gene_model_gr <- sort(gene_model_gr)
id_col <- mcols(gene_model_gr)$cds_id
id_col[1] <- "5utr_247279"
id_col[57] <- "3utr_247330"
id_col[seq(2, 56, 2)] <- paste0("cds", 1:28, "_", id_col[seq(2, 56, 2)])
id_col[seq(3, 55, 2)] <- paste0("intron", 1:27)
mcols(gene_model_gr) <- DataFrame(id = id_col)
cds_seqs <- extractTranscriptSeqs(hg_genome, as(gene_model_gr, "GRangesList"))
mcols(gene_model_gr)$seq <- cds_seqs
gene_model_gr
```


```{r}
loc_lens <-
  tibble(loc = mcols(gene_model_gr)$id, loc_len = width(gene_model_gr))
loc_lens
```



```{r}
EGFR_seq <- extractTranscriptSeqs(hg_genome, cds_tx)
EGFR_seq
```

```{r}
EGFR_aa <- translate(EGFR_seq)
EGFR_aa
```


```{r}
cds_starts <- start(cds_tx)
cds_ends <- end(cds_tx)
cds_width <-
  transcriptWidths(
    exonStarts = cds_starts,
    exonEnds = cds_ends
  )
ref_locs <- transcriptLocs2refLocs(list(c(1:cds_width)),
  exonStarts = cds_starts,
  exonEnds = cds_ends,
  strand = c("+")
)[[1]]
ref_locs[1:10]
```


# Load, combine, and filter variant data

## Load clinvar data

```{r}
clinvar_hg38_EGFR <-
  read_tsv(clinvar_EGFR_path)

clinvar_hg38_EGFR <-
  clinvar_hg38_EGFR %>%
  dplyr::select(
    -GeneID, -GeneSymbol, -HGNC_ID, -`nsv/esv (dbVar)`,
    -Assembly, -ChromosomeAccession, -ReferenceAllele,
    -AlternateAllele, -Cytogenetic
  ) %>%
  mutate(var_len = pmax(
    str_length(ReferenceAlleleVCF),
    str_length(AlternateAlleleVCF)
  )) %>%
  filter(var_len <= 10) %>%
  arrange(PositionVCF)

# only keep main transcript variants
clinvar_hg38_EGFR <-
  clinvar_hg38_EGFR %>%
  filter(Name != "NM_001346941.2(EGFR):c.89-4536_89-4529del")
clinvar_hg38_EGFR
```




```{r}
clinvar_hg38_EGFR_renamed <-
  clinvar_hg38_EGFR %>%
  dplyr::select(
    Name, Start, Stop,
    ReferenceAlleleVCF,
    AlternateAlleleVCF,
    VariationID
  ) %>%
  set_names(c(
    "name", "start", "stop",
    "ref", "alt", "clinvar_id"
  )) %>%
  mutate(name = str_replace(
    name, "NM_005228.5\\(EGFR\\):",
    paste0("g", start, "_")
  )) %>%
  mutate(clinvar_id = as.character(clinvar_id))
clinvar_hg38_EGFR_renamed
```

```{r}
clinvar_hg38_EGFR_not_snv <-
  clinvar_hg38_EGFR_renamed %>%
  filter(!(str_length(ref) == 1 &
    str_length(alt) == 1)) %>%
  mutate(type = case_when(
    (str_length(ref) == str_length(alt) &
      str_length(ref) == 1) ~ "snv",
    (str_sub(ref, 1, 1) == alt &
      str_length(alt) < str_length(ref)) ~ "del",
    (str_sub(alt, 1, 1) == ref &
      str_length(ref) == 1 &
      str_length(alt) > 1) ~ "ins",
    TRUE ~ "indel"
  )) %>%
  mutate(stop = if_else(type == "ins", stop - 1, stop)) %>%
  mutate(
    alt = if_else(type == "del", "", alt),
    ref = if_else(type == "del", str_sub(ref, 2), ref)
  ) %>%
  rename(clinvar_id = "id") %>%
  relocate(type, .after = "name")
clinvar_hg38_EGFR_not_snv
```

```{r}
clinvar_hg38_EGFR_snv <-
  clinvar_hg38_EGFR_renamed %>%
  filter((str_length(ref) == 1 & str_length(alt) == 1))
clinvar_hg38_EGFR_snv
```

## Load COSMIC data



```{r}
cosmic_dt <-
  read_tsv(cosmic_path)

cosmic_dt <-
  cosmic_dt %>%
  filter(TRANSCRIPT_ACCESSION == "ENST00000275493.6") %>%
  group_by(GENOMIC_MUTATION_ID) %>%
  dplyr::slice(1) %>%
  ungroup()

cosmic_dt_subset <-
  cosmic_dt %>%
  dplyr::select(
    MUTATION_CDS, MUTATION_AA,
    GENOME_START, GENOME_STOP,
    GENOMIC_WT_ALLELE, GENOMIC_MUT_ALLELE,
    GENOMIC_MUTATION_ID, MUTATION_DESCRIPTION
  ) %>%
  mutate(MUTATION_AA = str_replace(MUTATION_AA, "p.\\?", "")) %>%
  mutate(name = if_else(MUTATION_AA == "",
    paste0("g", GENOME_START, "_", MUTATION_CDS),
    paste0("g", GENOME_START, "_", MUTATION_CDS, " (", MUTATION_AA, ")")
  )) %>%
  dplyr::select(
    name, GENOME_START, GENOME_STOP,
    GENOMIC_WT_ALLELE, GENOMIC_MUT_ALLELE,
    GENOMIC_MUTATION_ID
  ) %>%
  set_names(c("name", "start", "stop", "ref", "alt", "cosmic_id")) %>%
  mutate(
    ref = if_else(is.na(ref), "", ref),
    alt = if_else(is.na(alt), "", alt)
  ) %>%
  mutate(var_len = pmax(str_length(ref), str_length(alt))) %>%
  filter(var_len <= 10) %>%
  dplyr::select(-var_len) %>%
  arrange(start)



cosmic_dt_subset_snv <-
  cosmic_dt_subset %>%
  filter(str_length(ref) == 1 & str_length(alt) == 1)

cosmic_dt_subset_not_snv <-
  cosmic_dt_subset %>%
  filter(!(str_length(ref) == 1 & str_length(alt) == 1))

cosmic_dt_subset_snv
```


```{r}
cosmic_dt_subset_not_snv <-
  cosmic_dt_subset_not_snv %>%
  mutate(type = case_when(
    (str_length(ref) == str_length(alt) & str_length(ref) == 1) ~ "snv",
    (str_length(ref) >= 1 & str_length(alt) == 0) ~ "del",
    (str_length(ref) == 0 & str_length(alt) >= 1) ~ "ins",
    TRUE ~ "indel"
  )) %>%
  relocate(type, .after = name)


cosmic_dt_subset_not_snv <-
  cosmic_dt_subset_not_snv %>%
  mutate(extra_base = map_chr(
    cosmic_dt_subset_not_snv$start,
    ~ as.character(subseq(hg_genome[["chr7"]], ., .))
  )) %>%
  mutate(
    stop = if_else(type == "ins", stop - 1, stop),
    ref = if_else(type == "ins", extra_base, ref),
    alt = if_else(type == "ins", paste0(extra_base, alt), alt)
  ) %>%
  dplyr::select(-extra_base)
cosmic_dt_subset_not_snv
```

```{r}
snv_dt <-
  full_join(cosmic_dt_subset_snv,
    clinvar_hg38_EGFR_snv,
    by = c("start", "stop", "ref", "alt")
  ) %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  dplyr::select(
    name, start, stop,
    ref, alt, cosmic_id,
    clinvar_id
  ) %>%
  arrange(start) %>%
  mutate(name = gsub("_c\\.\\d+[+-]?\\d+([[:alpha:]])>", "_\\1>", name))
snv_dt
```


## Load variants from base editor experiments

```{r}
ABE8e_all_dt <- read_csv(ABE8e_path)
BE4max_all_dt <- read_csv(BE4max_path)
ABE8e_dt <- ABE8e_all_dt %>%
  dplyr::select(c(14, 15, 17, 20, 27))
BE4max_dt <- BE4max_all_dt %>%
  dplyr::select(c(14, 15, 17, 20, 27))
base_editor_variants <-
  bind_rows(BE4max_dt, ABE8e_dt) %>%
  dplyr::filter(Mutation.category_simplified %in%
    c("Nonsense", "Missense", "Splice-acceptor", "Splice-donor"))
base_editor_variants <-
  base_editor_variants %>%
  separate_rows(Nucleotide.edits, sep = "C_") %>%
  separate_rows(Nucleotide.edits, sep = "A_") %>%
  separate_rows(Nucleotide.edits, sep = "_") %>%
  separate_rows(Nucleotide.edits, sep = ";") %>%
  dplyr::filter(Nucleotide.edits != "") %>%
  mutate(Nucleotide.edits = as.integer(Nucleotide.edits)) %>%
  mutate(start = if_else(sgRNA.Strand == "sense",
    sgrna.genomic.position + (Nucleotide.edits - 1),
    sgrna.genomic.position - (Nucleotide.edits - 1)
  )) %>%
  mutate(
    edit =
      case_when(
        Edit == "C-T" & sgRNA.Strand == "antisense" ~ "G-A",
        Edit == "A-G" & sgRNA.Strand == "antisense" ~ "T-C",
        TRUE ~ Edit
      )
  ) %>%
  dplyr::select(start, edit) %>%
  group_by(start) %>%
  dplyr::slice(1) %>%
  tidyr::separate(edit,
    into = c("ref", "alt"), sep = "-"
  ) %>%
  mutate(
    stop = start,
    name = paste0("g", start, "_", ref, ">", alt)
  ) %>%
  dplyr::select(c("name", "start", "stop", "ref", "alt"))
base_editor_variants
```


### Combine variants data

```{r}
snv_dt <-
  full_join(snv_dt,
    base_editor_variants,
    by = c("start", "stop", "ref", "alt")
  )

snv_dt <-
  snv_dt %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  rename(name.y = "be_id") %>%
  dplyr::select(
    name, start, stop, ref,
    alt, cosmic_id, clinvar_id, be_id
  ) %>%
  unite("id", cosmic_id:be_id, sep = ";", na.rm = TRUE) %>%
  arrange(start) %>%
  mutate(type = "snv")
snv_dt
```


```{r}
non_snv_dt <-
  full_join(clinvar_hg38_EGFR_not_snv,
    cosmic_dt_subset_not_snv,
    by = c("start", "stop", "ref", "alt", "type")
  ) %>%
  mutate(name = if_else(is.na(name.x), name.y, name.x)) %>%
  dplyr::select(
    name, type, start, stop,
    ref, alt, cosmic_id,
    id
  ) %>%
  arrange(start) %>%
  unite("id", cosmic_id:id, sep = ";", na.rm = TRUE)
non_snv_dt
```


```{r}
variants_dt <-
  bind_rows(
    snv_dt,
    non_snv_dt
  ) %>%
  arrange(start) %>%
  mutate(name = sub(" \\(p\\..*$", "", name))

variants_dt
```




## Annotate Variants


```{r}
annotate_and_predict_variants <- function(variants_dt) {
  seq_len_min <- 49

  num_variants <- NROW(variants_dt)
  gr0 <-
    GRanges(
      Rle(c("chr7"), num_variants),
      IRanges(
        start = variants_dt$start,
        end = variants_dt$stop,
        names = variants_dt$name
      )
    )

  var_allelles <- DNAStringSet(variants_dt$alt)

  intron_locations <-
    locateVariants(gr0,
      intbytx,
      IntronVariants(),
      varAllele = var_allelles
    )
  # Locate intronic variants
  intron_locations <-
    intron_locations %>%
    as_tibble() %>%
    add_column(name = names(intron_locations)) %>%
    relocate(name, .before = "seqnames") %>%
    dplyr::select(-PRECEDEID, -FOLLOWID, -TXID)

  # Locate remaining variants
  all <- locateVariants(gr0,
    txdb,
    AllVariants(),
    varAllele = var_allelles
  )
  all <-
    all %>%
    as_tibble() %>%
    add_column(name = names(all)) %>%
    relocate(name, .before = "seqnames") %>%
    dplyr::select(-PRECEDEID, -FOLLOWID)

  # Flag intronic and 5UTR variants that have alternative 
  # effects given other potential spliced transcripts

  transcript_splice_sites <-
    all %>%
    dplyr::filter(LOCATION == "spliceSite" & TXID == "93502")

  intron_locations <-
    intron_locations %>%
    filter(!(name %in% transcript_splice_sites$name))

  uncertain_introns <-
    all %>%
    dplyr::filter(name %in% intron_locations$name) %>%
    group_by(name, LOCATION) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    group_by(name) %>%
    mutate(frac_intron = sum(LOCATION == "intron") / n()) %>%
    dplyr::filter(frac_intron < 1) %>%
    dplyr::filter(LOCATION != "intron") %>%
    dplyr::filter(TXID != 93502) %>%
    arrange(QUERYID) %>%
    dplyr::select(-GENEID, -frac_intron) %>%
    ungroup()

  flag_variants <-
    uncertain_introns %>%
    dplyr::select(name, LOCATION, TXID) %>%
    left_join(., tx_lengths %>%
      dplyr::select(tx_id, tx_name) %>%
      mutate(tx_id = as.character(tx_id)) %>%
      dplyr::rename(TXID = "tx_id"),
    by = "TXID"
    ) %>%
    dplyr::select(-TXID) %>%
    unite("alt_effect", LOCATION:tx_name)


  all <-
    all %>%
    dplyr::filter(TXID == 93502) %>%
    dplyr::select(-TXID)

  potential_promoters <-
    all[which(duplicated(all$name)), ] %>%
    dplyr::select(name) %>%
    mutate(alt_effect = "promoter_ENST00000275493.7")

  flag_variants <-
    bind_rows(flag_variants, potential_promoters)

  flag_variants <-
    flag_variants %>%
    group_by(name) %>%
    dplyr::slice(1) %>%
    ungroup()


  all <-
    all %>%
    dplyr::filter(!duplicated(all$name))


  all <-
    bind_rows(
      all,
      intron_locations
    )

  all <-
    left_join(all, flag_variants, by = "name") %>%
    dplyr::select(-GENEID)


  all <-
    all %>%
    dplyr::select(-c(LOCSTART:CDSID))

  gene_model_gr_list <-
    as(gene_model_gr, "GRangesList")

  names(gene_model_gr_list) <- mcols(gene_model_gr)$id

  map_grs <-
    mapToTranscripts(resize(gr0, 1), gene_model_gr_list)

  map_dt <-
    map_grs %>%
    as_tibble() %>%
    mutate(name = names(map_grs)) %>%
    dplyr::select(name, seqnames, start) %>%
    set_names(c("name", "loc", "loc_start")) %>%
    left_join(., loc_lens, by = "loc")

  all <-
    left_join(all,
      map_dt,
      by = "name"
    )

  # Predict consequence of variant
  predict_coding_variants <-
    predictCoding(gr0, txdb, hg_genome, varAllele = var_allelles)
  predict_coding_variants <-
    predict_coding_variants[mcols(predict_coding_variants)$TXID == 93502]
  predict_coding_variants <-
    as_tibble(predict_coding_variants) %>%
    add_column(name = names(predict_coding_variants)) %>%
    dplyr::select(
      name, CDSLOC.start, CONSEQUENCE, REFCODON,
      VARCODON, REFAA, VARAA, PROTEINLOC
    )

  n_distinct(all$name)

  all <-
    left_join(all, predict_coding_variants, by = "name")



  all <-
    all %>%
    mutate(loc_end_comp = loc_len - (loc_start + width - 1)) %>%
    mutate(boundary_flag = (LOCATION == "coding" &
      (loc_start < seq_len_min | loc_end_comp < seq_len_min)))

  all <-
    all %>%
    dplyr::select(
      name, CONSEQUENCE, LOCATION, loc, loc_len, loc_start,
      CDSLOC.start, PROTEINLOC, REFCODON, VARCODON,
      REFAA, VARAA, boundary_flag, alt_effect
    ) %>%
    set_names(c(
      "name", "consequence", "location", "region", "region_len",
      "pos_region", "pos_cds", "pos_protein", "ref_codon",
      "var_codon", "ref_aa", "var_aa", "boundary_flag", "alt_effect"
    )) %>%
    rowwise() %>%
    mutate(pos_protein = str_c(pos_protein, collapse = ":"))

  return(all)
}
```


```{r}
all <-
  annotate_and_predict_variants(variants_dt)
```


## Include base editing variants generated by two or more edits

```{r}
min_base_changes <- function(target_amino_acid, ref_codon) {
  # Define a mapping of amino acids to codons
  amino_acid_mapping <- list(
    "A" = c("GCT", "GCC", "GCA", "GCG"),
    "R" = c("CGT", "CGC", "CGA", "CGG", "AGA", "AGG"),
    "N" = c("AAT", "AAC"),
    "D" = c("GAT", "GAC"),
    "C" = c("TGT", "TGC"),
    "Q" = c("CAA", "CAG"),
    "E" = c("GAA", "GAG"),
    "G" = c("GGT", "GGC", "GGA", "GGG"),
    "H" = c("CAT", "CAC"),
    "I" = c("ATA"), # "ATT", "ATC", two A>Ts
    "L" = c("TTA", "TTG", "CTT", "CTC", "CTA", "CTG"),
    "K" = c("AAA", "AAG"),
    "M" = c("ATG"),
    "F" = c("TTT", "TTC"),
    "P" = c("CCT", "CCC", "CCA", "CCG"),
    "S" = c("TCT", "TCC", "TCA", "TCG", "AGT", "AGC"),
    "T" = c("ACT", "ACC", "ACA", "ACG"),
    "W" = c("TGG"),
    "Y" = c("TAT", "TAC"),
    "V" = c("GTT", "GTC", "GTA", "GTG"),
    "*" = c("TAA", "TAG", "TGA")
  )

  # Find the reference amino acid corresponding to the reference codon
  ref_amino_acid <-
    names(amino_acid_mapping)[which(ref_codon %in% unlist(amino_acid_mapping))]

  # Check if the reference amino acid is valid
  if (is.null(ref_amino_acid)) {
    stop("Invalid reference codon.")
  }

  # Check if the target amino acid is valid
  if (!(target_amino_acid %in% names(amino_acid_mapping))) {
    stop("Invalid target amino acid.")
  }

  # Find the target codons for the given amino acid
  target_codons <- amino_acid_mapping[[target_amino_acid]]

  # Calculate the number of base changes needed for each target codon
  base_changes <-
    sapply(
      target_codons,
      function(tc) sum(strsplit(ref_codon, "")[[1]] != strsplit(tc, "")[[1]])
    )

  # Find the index of the minimum base changes
  min_index <-
    which.min(base_changes)

  # Return the variant codon with minimum base changes
  return(target_codons[min_index])
}

get_codon_at_aa_position <- function(pos) {
  start_pos <- pos * 3 - 2
  end_pos <- pos * 3
  as.character(subseq(EGFR_seq, start_pos, end_pos))
}


ABE8e_aa_var <-
  ABE8e_all_dt %>%
  dplyr::select(Amino.acid.edits) %>%
  separate_rows(Amino.acid.edits, sep = ";") %>%
  dplyr::filter(!(Amino.acid.edits %in% c("utr", "None", ""))) %>%
  dplyr::filter(!str_detect(Amino.acid.edits, "Exon")) %>%
  separate_wider_regex(Amino.acid.edits,
    c(
      ref_aa = "^[A-Za-z]+",
      pos_protein = "[0-9]+",
      var_aa = "[A-Za-z]+$"
    ),
    cols_remove = F
  ) %>%
  dplyr::filter(ref_aa != var_aa)


BE4max_aa_var <-
  BE4max_all_dt %>%
  dplyr::select(Amino.acid.edits) %>%
  separate_rows(Amino.acid.edits, sep = ";") %>%
  dplyr::filter(!(Amino.acid.edits %in% c("utr", "None", ""))) %>%
  dplyr::filter(!str_detect(Amino.acid.edits, "Exon")) %>%
  separate_wider_regex(Amino.acid.edits,
    c(
      ref_aa = "^[A-Za-z]+",
      pos_protein = "[0-9]+",
      var_aa = "[A-Za-z]+$"
    ),
    cols_remove = F
  ) %>%
  dplyr::filter(ref_aa != var_aa)

aa_map <- c(AMINO_ACID_CODE, "*" = "Ter")
flipped_aa_map <- setNames(names(aa_map), aa_map)

be_var_dt <-
  bind_rows(ABE8e_aa_var, BE4max_aa_var) %>%
  group_by(Amino.acid.edits) %>%
  dplyr::slice(1) %>%
  ungroup() %>%
  mutate(
    ref_aa = flipped_aa_map[ref_aa],
    var_aa = flipped_aa_map[var_aa]
  )

variants_dt0 <-
  left_join(variants_dt, all, by = "name")

be_var_dt <-
  left_join(be_var_dt,
    variants_dt0 %>%
      filter(pos_protein != "" & type == "snv") %>%
      dplyr::select(name, ref_aa, pos_protein, var_aa),
    by = c("ref_aa", "pos_protein", "var_aa")
  ) %>%
  dplyr::filter(is.na(name)) %>%
  mutate(pos_protein = as.numeric(pos_protein)) %>%
  rowwise() %>%
  mutate(ref_codon = get_codon_at_aa_position(pos_protein))

be_var_dt <-
  be_var_dt %>%
  mutate(var_codon = min_base_changes(var_aa, ref_codon))

be_var_dt <-
  be_var_dt %>%
  mutate(
    last_base_diff =
      str_sub(ref_codon, 3, 3) != str_sub(var_codon, 3, 3)
  ) %>%
  mutate(
    ref = if_else(last_base_diff, ref_codon, str_sub(ref_codon, 1, 2)),
    alt = if_else(last_base_diff, var_codon, str_sub(var_codon, 1, 2)),
    start = ref_locs[pos_protein * 3 - 2],
    stop = if_else(last_base_diff, start + 2, start + 1)
  ) %>%
  mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
  mutate(
    id = name,
    type = "indel"
  ) %>%
  dplyr::select(c("name", "type", "start", "stop", "ref", "alt", "id"))
rm(variants_dt0)

# remove g55174776_TT>CC since it is the same as g55174776_c.2239_2240delinsCC
be_var_dt <-
  be_var_dt %>%
  filter(name != "g55174776_TT>CC")
be_var_dt
```




### Include double base edits variants

```{r}
variants_dt <-
  bind_rows(variants_dt, be_var_dt) %>%
  arrange(start)
```


### Reannotate after inclusion of double base edits variants

```{r}
all <-
  annotate_and_predict_variants(variants_dt)

variants_dt <-
  left_join(variants_dt, all, by = "name")

variants_dt
```
```{r}
table(variants_dt$location)
```

```{r}
table(variants_dt$location, variants_dt$type)
```


### Filter out introns


```{r}
variants_dt <-
  variants_dt %>%
  filter(location != c("intron"))
variants_dt
```



## Generate synonymous mutations near target Single Nucleotide Variants


```{r}
# Function to split a string into consecutive triplets
split_into_triplets <- function(s) {
  n <- nchar(s)
  if (n %% 3 != 0) {
    warning("String length is not a multiple of 3. Padding with spaces.")
    s <- paste0(s, rep(" ", 3 - (n %% 3)))
  }
  matrix(strsplit(s, "")[[1]], ncol = 3, byrow = TRUE)
}

# Split the vector into triplets
triplets <- split_into_triplets(as.character(EGFR_seq))

# Create a dataframe
codon_map_df <-
  tibble(ref_codon = apply(triplets, 1, paste, collapse = "")) %>%
  mutate(pos_protein = row_number())
codon_map_df
```


```{r}
syn_codon_map <-
  read_delim(syn_codon_path)
syn_codon_map
```

```{r}
codon_map_df <-
  left_join(codon_map_df, syn_codon_map, by = "ref_codon") %>%
  mutate(start = ref_locs[pos_protein * 3]) %>%
  filter(!is.na(ref_aa))
codon_map_df
```



### Split SNVS into spliceSite coding, and intron variants

```{r}
variants_snv <-
  variants_dt %>%
  filter(type == "snv")

variants_snv_coding <-
  variants_snv %>%
  filter(location %in% c("coding"))
```

### Find nonsynomous mutations

```{r}
get_codon_map <- function(start, name, num_idx) {
  dis <- abs(start - codon_map_df$start)
  dis[dis <= 2] <- 10000
  lowest_indices <- head(order(dis), num_idx)

  bp_dist <- codon_map_df$start[lowest_indices] - start

  dt <- tibble(name = name, dist = bp_dist)
  dt <- bind_cols(dt, codon_map_df[lowest_indices, ])
  return(dt)
}

coding_syn <-
  map2_df(variants_snv_coding$start,
    variants_snv_coding$name,
    get_codon_map,
    num_idx = 1
  ) %>%
  rename(name = "target_name") %>%
  mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
  dplyr::select(
    name, start, ref, alt, ref,
    target_name, pos_protein,
    ref_codon, var_codon, ref_aa, dist
  )
```

```{r}
table(coding_syn$dist)
```

```{r}
coding_syn <-
  coding_syn %>%
  dplyr::select(-dist) %>%
  group_by(name) %>%
  mutate(target_name = paste(target_name, collapse = ";")) %>%
  dplyr::slice(1)
coding_syn
```


```{r}
## Previous code for including intronic variants!!

# variants_snv_intron <-
#   variants_snv %>%
#   filter(location %in% c("intron", "spliceSite"))

# mut_pos_dist <- 3
# intron_syn <-
#   variants_snv_intron %>%
#   dplyr::select(name, start, region_len, pos_region) %>%
#   mutate(pos_region_comp = region_len - pos_region) %>%
#   mutate(upstream = case_when(
#     pos_region_comp < 8 ~ F,
#     pos_region_comp >= 8 ~ T
#   )) %>%
#   mutate(
#     start_syn = if_else(upstream, start + mut_pos_dist, start - mut_pos_dist),
#     ref = map_chr(
#       start_syn,
#       ~ as.character(subseq(hg_genome[["chr7"]], ., .))
#     )
#   )
#

# dup_alternatives <-
#   intron_syn %>%
#   filter(start %in% intron_syn$start_syn) %>%
#   separate(name, into = c("edit", "alt0"), remove = F, sep = ">") %>%
#   separate(edit, into = c("loc", "ref0"), remove = F, sep = "_") %>%
#   dplyr::select(name, start, ref0, alt0) %>%
#   group_by(start) %>%
#   mutate(bases = paste(alt0, collapse = ",")) %>%
#   ungroup() %>%
#   mutate(bases = paste0(bases, ",", ref0)) %>%
#   rowwise() %>%
#   mutate(alt_2 = setdiff(c("A", "C", "T", "G"),
#                          str_split(bases, pattern = ",")[[1]])[1]) %>%
#   mutate(name_sym = paste0("g", start, "_", ref0, ">", alt_2)) %>%
#   dplyr::select(name, name_sym, alt_2)
# dup_alternatives

# intron_syn <-
#   intron_syn %>%
#   mutate(alt = case_when(
#     ref == "G" ~ "A",
#     ref == "A" ~ "G",
#     ref == "C" ~ "T",
#     ref == "T" ~ "C"
#   )) %>%
#   dplyr::select(name, start_syn, ref, alt) %>%
#   dplyr::rename(target_name = "name", start = "start_syn") %>%
#   mutate(name = paste0("g", start, "_", ref, ">", alt)) %>%
#   dplyr::select(name, start, ref, alt, target_name) %>%
#   group_by(name) %>%
#   mutate(target_name = paste(target_name, collapse = ";")) %>%
#   dplyr::slice(1)


# intron_syn <-
#   left_join(intron_syn,
#     dup_alternatives,
#     by = "name"
#   ) %>%
#   mutate(
#     name = if_else(is.na(name_sym), name, name_sym),
#     alt = if_else(is.na(alt_2), alt, alt_2)
#   ) %>%
#   dplyr::select(-name_sym, -alt_2)

# syn_variants <-
#   bind_rows(
#     coding_syn %>%
#       mutate(location = "coding"),
#     intron_syn %>%
#       mutate(
#         pos_protein = NA,
#         ref_codon = NA,
#         var_codon = NA,
#         ref_aa = NA,
#         location = "intron"
#       )
#   )
# syn_variants
```


```{r}
syn_variants <-
  left_join(
    coding_syn %>%
      mutate(location = "coding"),
    variants_snv %>%
      dplyr::select(name, id),
    by = "name"
  ) %>%
  mutate(
    syn_in_target = name %in% variants_dt$name,
    num_targets = str_count(target_name, ";") + 1
  )
syn_variants
```

```{r}
syn_target_map <-
  syn_variants %>%
  group_by(num_targets, syn_in_target) %>%
  tally()
syn_target_map
```



```{r}
syn_variants_long <-
  syn_variants %>%
  separate_rows(target_name, sep = ";") %>%
  dplyr::select(
    target_name, name, start, ref,
    alt
  ) %>%
  set_names(c(
    "name", "name_syn", "start_syn",
    "ref_syn", "alt_syn"
  ))
syn_variants_long
```


```{r}
variants_dt <-
  left_join(variants_dt, syn_variants_long, by = "name")
variants_dt
```

```{r}
variants_dt <-
  variants_dt %>%
  mutate(target_in_syn = if_else(name %in% syn_variants$name, T, F)) %>%
  mutate(name_target_syn = paste0(name, ":", name_syn)) %>%
  mutate(
    clinvar = if_else(grepl("\\b\\d+\\b", id), T, F),
    cosmic = if_else(grepl("COSV", id), T, F),
    base_edit = if_else(grepl("g\\d+_[A-Z]+>[A-Z]+", id), T, F)
  ) %>%
  dplyr::select(
    name, id, ref, alt, ref_aa, var_aa, pos_protein,
    type, consequence, location, name_syn, name_target_syn,
    target_in_syn, alt_effect,
    clinvar, cosmic, base_edit, start, stop, start_syn, everything()
  )  %>%
  mutate(name_target_syn=if_else(is.na(name_syn), NA, name_target_syn)) 
variants_dt
```




### Split variants into target, target_syn, and syn



```{r}
variants_snv <-
  variants_dt %>%
  filter(!is.na(name_syn))  


target_syn_variants <-
  variants_snv  %>%
  dplyr::select(name_target_syn, name_syn, name) %>%
  set_names(c("target_name","name_syn", "name_target"))  %>%
  mutate( name_target_syn=NA,
          num_targets=NA,
          paired_targets_as_syn= NA, 
          paired_targets_syn_as_syn= NA) %>%
  dplyr::select(target_name, name_target, name_syn, 
                name_target_syn,
                num_targets, paired_targets_as_syn,
                paired_targets_syn_as_syn )
target_syn_variants

```

```{r}
syn_variants_dt <-
  variants_snv %>%
  dplyr::select(name_syn, name, name_target_syn) %>%
  set_names(c("target_name", "name_target", "name_target_syn" ))  %>%

  group_by(target_name)%>% 
  mutate(name_target = paste(name_target, collapse = ";"),
         name_target_syn = paste(name_target_syn, collapse = ";")) %>% 
  dplyr::slice(1)    %>% 
  mutate( 
          syn_in_target = target_name %in% variants_dt$name,
          num_targets = str_count(name_target, ";") + 1
  )


syn_variants_dt_filtered <-
  syn_variants_dt %>%
  mutate(target_in_syn=NA,
        name_syn=NA,
        paired_targets_as_syn= NA, 
        paired_targets_syn_as_syn= NA) %>%
  filter(!(syn_in_target))%>%
  dplyr::select(target_name, name_target, name_syn,
                name_target_syn,
                num_targets, 
                paired_targets_as_syn, 
                paired_targets_syn_as_syn )

syn_variants_in_target <-
    syn_variants_dt %>%
  filter((syn_in_target)) %>%
  dplyr::select(-syn_in_target) %>%
  rename(name_target="paired_targets_as_syn",
         name_target_syn="paired_targets_syn_as_syn")
syn_variants_dt_filtered
```



```{r}
variants_dt <-
  left_join(variants_dt, 
          syn_variants_in_target %>%
  rename(target_name="name"),
  by="name")
variants_dt
```

```{r}
target_variants <-
  variants_dt %>%
  dplyr::select(name, name_syn, name_target_syn, paired_targets_as_syn, paired_targets_syn_as_syn, num_targets) %>%
  set_names(c("target_name", "name_syn", "name_target_syn", "paired_targets_as_syn", "paired_targets_syn_as_syn", "num_targets"))  %>%
  mutate(name_target=NA)  %>%
  dplyr::select(target_name, name_target, name_syn,
                name_target_syn,
                num_targets, 
                paired_targets_as_syn, 
                paired_targets_syn_as_syn )
  
target_variants
```


```{r}
all_variants_paired <- 
  bind_rows(target_variants, 
            target_syn_variants, 
            syn_variants_dt_filtered)
all_variants_paired 
```



### Save to files

```{r}
write_tsv(
  all_variants_paired,
  all_variants_paired_path
)

write_tsv(
  variants_dt, variants_table_path)
```


```{r}
sessionInfo()
```
